{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from pandas import Series, DataFrame\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "\n",
      "import sklearn\n",
      "from sklearn.ensemble import GradientBoostingClassifier\n",
      "from sklearn import preprocessing\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 133
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# DataFrame.from_csv has been removed from pandas; pd.read_csv is the supported reader.\n",
      "air_raw = pd.read_csv(\"allyears_tiny.csv\", index_col=False)\n",
      "print(air_raw.head())\n",
      "\n",
      "# Seed NumPy so the random split column is identical on every Restart & Run All.\n",
      "np.random.seed(42)\n",
      "# One uniform draw per row, assigned positionally; the split cell thresholds this column.\n",
      "air_raw['RandNum'] = np.random.uniform(size=len(air_raw))\n",
      "print(air_raw.head())\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \\\n",
        "0  1987     10          14          3      741         730      912   \n",
        "1  1987     10          15          4      729         730      903   \n",
        "2  1987     10          17          6      741         730      918   \n",
        "3  1987     10          18          7      729         730      847   \n",
        "4  1987     10          19          1      749         730      922   \n",
        "\n",
        "   CRSArrTime UniqueCarrier  FlightNum      ...       Cancelled  \\\n",
        "0         849            PS       1451      ...               0   \n",
        "1         849            PS       1451      ...               0   \n",
        "2         849            PS       1451      ...               0   \n",
        "3         849            PS       1451      ...               0   \n",
        "4         849            PS       1451      ...               0   \n",
        "\n",
        "   CancellationCode  Diverted  CarrierDelay  WeatherDelay  NASDelay  \\\n",
        "0               NaN         0           NaN           NaN       NaN   \n",
        "1               NaN         0           NaN           NaN       NaN   \n",
        "2               NaN         0           NaN           NaN       NaN   \n",
        "3               NaN         0           NaN           NaN       NaN   \n",
        "4               NaN         0           NaN           NaN       NaN   \n",
        "\n",
        "  SecurityDelay LateAircraftDelay  IsArrDelayed  IsDepDelayed  \n",
        "0           NaN               NaN           YES           YES  \n",
        "1           NaN               NaN           YES            NO  \n",
        "2           NaN               NaN           YES           YES  \n",
        "3           NaN               NaN            NO            NO  \n",
        "4           NaN               NaN           YES           YES  \n",
        "\n",
        "[5 rows x 31 columns]\n",
        "   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \\\n",
        "0  1987     10          14          3      741         730      912   \n",
        "1  1987     10          15          4      729         730      903   \n",
        "2  1987     10          17          6      741         730      918   \n",
        "3  1987     10          18          7      729         730      847   \n",
        "4  1987     10          19          1      749         730      922   \n",
        "\n",
        "   CRSArrTime UniqueCarrier  FlightNum    ...     CancellationCode  Diverted  \\\n",
        "0         849            PS       1451    ...                  NaN         0   \n",
        "1         849            PS       1451    ...                  NaN         0   \n",
        "2         849            PS       1451    ...                  NaN         0   \n",
        "3         849            PS       1451    ...                  NaN         0   \n",
        "4         849            PS       1451    ...                  NaN         0   \n",
        "\n",
        "   CarrierDelay  WeatherDelay  NASDelay  SecurityDelay LateAircraftDelay  \\\n",
        "0           NaN           NaN       NaN            NaN               NaN   \n",
        "1           NaN           NaN       NaN            NaN               NaN   \n",
        "2           NaN           NaN       NaN            NaN               NaN   \n",
        "3           NaN           NaN       NaN            NaN               NaN   \n",
        "4           NaN           NaN       NaN            NaN               NaN   \n",
        "\n",
        "  IsArrDelayed  IsDepDelayed   RandNum  \n",
        "0          YES           YES  0.193944  \n",
        "1          YES            NO  0.466327  \n",
        "2          YES           YES  0.943457  \n",
        "3           NO            NO  0.232673  \n",
        "4          YES           YES  0.133799  \n",
        "\n",
        "[5 rows x 32 columns]\n"
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def binarize_column(frame, column):\n",
      "    \"\"\"Binarize ``frame[column]`` with a LabelBinarizer fitted on that same column.\n",
      "\n",
      "    Returns ``(binarizer, indicators)``: the fitted binarizer and a DataFrame\n",
      "    holding its output, indexed like ``frame`` with columns ``<column>_<i>``.\n",
      "    \"\"\"\n",
      "    binarizer = preprocessing.LabelBinarizer()\n",
      "    # fit_transform ties the transform to the encoder fitted on this very column,\n",
      "    # so one column can never be encoded with another column's classes.\n",
      "    encoded = binarizer.fit_transform(frame[column])\n",
      "    # String, per-feature column names avoid the duplicated integer labels 0..n\n",
      "    # that the three encoded blocks would otherwise share after concatenation.\n",
      "    names = ['%s_%d' % (column, i) for i in range(encoded.shape[1])]\n",
      "    indicators = DataFrame(encoded, index=frame.index, columns=names)\n",
      "    return binarizer, indicators\n",
      "\n",
      "\n",
      "# Start from the split key, then add the integer target.\n",
      "air_mapped = air_raw[['RandNum']].copy()\n",
      "\n",
      "# Target: 1 when IsDepDelayed is 'YES', otherwise 0 (vectorized, no row-wise apply).\n",
      "air_mapped['IsDepDelayedInt'] = (air_raw['IsDepDelayed'] == 'YES').astype(int)\n",
      "print(air_mapped.shape)\n",
      "\n",
      "lb_origin, tmp_origin_df = binarize_column(air_raw, 'Origin')\n",
      "print(tmp_origin_df.shape)\n",
      "\n",
      "lb_dest, tmp_dest_df = binarize_column(air_raw, 'Dest')\n",
      "print(tmp_dest_df.shape)\n",
      "\n",
      "lb_uniquecarrier, tmp_uniquecarrier_df = binarize_column(air_raw, 'UniqueCarrier')\n",
      "print(tmp_uniquecarrier_df.shape)\n",
      "\n",
      "# Column-wise concatenation; every piece shares air_raw's index, so rows line up.\n",
      "air_mapped = pd.concat(\n",
      "    [\n",
      "        air_mapped,\n",
      "        tmp_origin_df,\n",
      "        tmp_dest_df,\n",
      "        air_raw['Distance'],\n",
      "        tmp_uniquecarrier_df,\n",
      "        air_raw['Month'],\n",
      "        air_raw['DayofMonth'],\n",
      "        air_raw['DayOfWeek'],\n",
      "    ],\n",
      "    axis=1)\n",
      "print(air_mapped.shape)\n",
      "\n",
      "air = air_mapped"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(999, 2)\n",
        "(999, 10)\n",
        "(999, 10)\n",
        "(999, 10)\n",
        "(999, 36)\n"
       ]
      }
     ],
     "prompt_number": 122
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# .ix has been removed from pandas; boolean-mask row selection uses .loc.\n",
      "air_train = air.loc[air['RandNum'] <= 0.8]\n",
      "# Rows with 0.8 < RandNum <= 0.9 land in neither split while the validation set is disabled:\n",
      "# air_valid = air.loc[(air['RandNum'] > 0.8) & (air['RandNum'] <= 0.9)]\n",
      "air_test = air.loc[air['RandNum'] > 0.9]\n",
      "\n",
      "print(air_train.shape)\n",
      "print(air_test.shape)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(824, 36)\n",
        "(91, 36)\n"
       ]
      }
     ],
     "prompt_number": 124
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Label vector: the integer delay indicator of the training rows.\n",
      "y_train = air_train['IsDepDelayedInt']\n",
      "\n",
      "# Feature matrix: everything except the split key and the label.\n",
      "# drop() returns a new frame, so air_train itself is left unchanged.\n",
      "X_train = air_train.drop(['RandNum', 'IsDepDelayedInt'], axis=1)\n",
      "\n",
      "print(list(X_train.columns.values))\n",
      "print(X_train.shape)\n",
      "print(y_train.shape)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Distance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Month', 'DayofMonth', 'DayOfWeek']\n",
        "(824, 34)\n",
        "(824,)\n"
       ]
      }
     ],
     "prompt_number": 128
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# A fixed random_state keeps the fitted model reproducible from run to run.\n",
      "# NOTE(review): the saved test predictions are all 1; with only 10 trees at\n",
      "# learning_rate=0.01 the model may barely move from its initial estimate - TODO confirm and tune.\n",
      "clf = GradientBoostingClassifier(n_estimators=10, max_depth=3, learning_rate=0.01,\n",
      "                                 random_state=42)\n",
      "clf.fit(X_train, y_train)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 132,
       "text": [
        "GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',\n",
        "              max_depth=3, max_features=None, max_leaf_nodes=None,\n",
        "              min_samples_leaf=1, min_samples_split=2, n_estimators=10,\n",
        "              random_state=None, subsample=1.0, verbose=0,\n",
        "              warm_start=False)"
       ]
      }
     ],
     "prompt_number": 132
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Test features: the split key and the label are removed; air_test is left unchanged.\n",
      "X_test = air_test.drop(['RandNum', 'IsDepDelayedInt'], axis=1)\n",
      "print(list(X_test.columns.values))\n",
      "print(X_test.shape)\n",
      "\n",
      "print(\"\")\n",
      "print(\"--- PREDICTIONS ---\")\n",
      "print(\"\")\n",
      "# Predicted class for every held-out row, from the classifier fitted above.\n",
      "pred = clf.predict(X_test)\n",
      "print(pred)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Distance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Month', 'DayofMonth', 'DayOfWeek']\n",
        "(91, 34)\n",
        "\n",
        "--- PREDICTIONS ---\n",
        "\n",
        "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
        " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
        " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n"
       ]
      }
     ],
     "prompt_number": 137
    }
   ],
   "metadata": {}
  }
 ]
}